import sys
import os

#takes mallet output and creates a doc-by-topic matrix
#(with topics in numerical order)
#command line arguments
#1) input file
#2) output file

def segmentInput(line):
	"""Parse one content line of the mallet docByTopic output file.

	The line is split on whitespace. Token 0 is the doc number (discarded),
	token 1 is the doc name, and the remaining tokens are read as
	consecutive topic/pct pairs. A trailing token with no partner is ignored.

	Args:
		line: one whitespace-separated line of text.

	Returns:
		A two-element list [docName, topicProbs], where topicProbs is a
		dictionary mapping each topic token to its pct token. Both are kept
		as the strings found in the line; nothing is converted to a number.

	Raises:
		ValueError: if the line has fewer than two tokens, so there is no
			doc name to return (for example a blank line).
	"""
	#the line from the input parsed into "words"
	tps = line.split()
	if len(tps) < 2:
		raise ValueError(
			"expected '<doc number> <doc name> [<topic> <pct> ...]', got %r" % (line,)
		)

	#token 0 is the doc number, which is not needed; token 1 is the doc name
	docName = tps[1]

	#pair every even-indexed token from 2 on (a topic) with the token right
	#after it (its pct); zip stops at the shorter slice, dropping a dangling topic
	topicProbs = dict(zip(tps[2::2], tps[3::2]))
	return [docName, topicProbs]

#command line arguments: 1) input file, 2) output file
inputPath = sys.argv[1]
outputPath = sys.argv[2]

#"with" closes both files even if a line fails to parse part-way through
with open(inputPath) as inputFile, open(outputPath, "w") as outputFile:
	headerWritten = False
	for it, line in enumerate(inputFile):
		#prints out a line number every 100 iterations so I know it's working:
		if it % 100 == 0:
			print(it)

		#the first line contains a mallet header that we don't want.
		#here we throw it away
		if it == 0:
			continue

		#a blank line (e.g. at the end of the file) carries no document
		if not line.strip():
			continue

		docName, topicProbs = segmentInput(line)

		#column keys "0", "1", ... up to the number of topics on this line;
		#a line missing one of these topic numbers raises KeyError below
		topicIds = [str(i) for i in range(len(topicProbs))]

		#when we reach the first content line, write a header line
		#with "Document" over the doc names and the topic number
		#over each topic column:
		if not headerWritten:
			outputFile.write(",".join(["Document"] + topicIds) + "\n")
			headerWritten = True

		#write the topic probabilities in order:
		row = [docName] + [topicProbs[topicId] for topicId in topicIds]
		outputFile.write(",".join(row) + "\n")
